header <- c('entry', 'enzyme', 'Km',
            'accession', 'KEGG', 'binomial',
            'type', 'reference')

oxidases <- readxl::read_xlsx('HBH_table_v3.xlsx', 
                              col_names = header,
                              skip = 1) %>%
  mutate(across(c(entry, accession, KEGG, binomial, type), factor))
ko <- read_delim('ko.csv', ';')
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   KEGG = col_character(),
##   definition = col_character()
## )
oxidases <- oxidases %>%
  mutate(across(c('binomial', 'KEGG'), ~ fct_relevel(.x, sort))) %>%
  mutate(genus = str_extract(binomial, '[^ ]+')) %>%
  left_join(ko, 'KEGG')
taxon_levels <- c('superkingdom', 'phylum', 'class', 'order', 'family', 'genus')
genera <- oxidases %>% 
  pull(genus) %>%
  as.character() %>%
  sort() %>%
  unique()
oxidases <- oxidases %>%
  mutate(EC = .$definition %>% 
           str_extract('EC:[^\\]]+') %>% 
           str_remove('EC:'),
         EC_f = EC %>%
           str_extract('[0-9]\\.[0-9]+\\.[0-9]+'),
         EC_f = ifelse(!is.na(EC_f), paste0(EC_f, '.-'), EC_f),
         EC_sf = EC %>%
           str_extract('[0-9]\\.[0-9]+'),
         EC_sf = ifelse(!is.na(EC_sf), paste0(EC_sf, '.-.-'), EC_sf))
# taxonomy_map <- genera %>% 
#   purrr::map(~ myTAI::taxonomy(organism = .x, db = 'ncbi', output = 'classification'))
# saveRDS(taxonomy_map, 'taxonomy_map.rds')

taxonomy_map <- readRDS('taxonomy_map.rds')
taxonomy_data <- taxonomy_map %>%
  purrr::map_dfr(~ .x %>% 
                   filter(rank %in% taxon_levels) %>% 
                   select(-id) %>%
                   pivot_wider(names_from = rank, 
                               values_from = name)) %>%
  select(all_of(taxon_levels))

oxidases <- oxidases %>%
  left_join(taxonomy_data, by = 'genus')
oxidases <- oxidases %>%
  mutate(subtype = type) %>%
  mutate(subtype = case_when(
    grepl('di[)]*oxygenase', definition, perl=T, ignore.case = T) ~ 'dioxygenase',
    grepl('mono[-]*oxygenase', definition, perl=T, ignore.case = T) ~ 'monooxygenase',
    grepl('hydroxylase', definition, perl=T, ignore.case = T) ~ 'hydroxylase',
    type == 'terminal' ~ 'terminal',
    TRUE ~ 'other',
  )) %>%
  dplyr::distinct()
types <- oxidases %>%
  pull(type) %>%
  levels()
color_map <- c('royalblue', 'goldenrod')
names(color_map) <- types
logbreaks <- c(0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0)
loglabels <- c('0.001', '0.01', '0.10', '1.0', '10.0', '100.0', '1000.0')
oxidases %>%
  ggplot(aes(x = Km, color = type))+
  geom_density() +
  scale_x_continuous(trans = reverselog_trans(10),
                     breaks = logbreaks,
                     labels = loglabels) +
  scale_color_manual(values = color_map) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  theme_bw()

p1 <- oxidases %>%
  ggplot(aes(x = Km, y = type, fill = type, color = type )) +
  geom_boxplot(outlier.shape = 21) +
  scale_x_continuous(trans = reverselog_trans(10),
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  scale_fill_manual(values = color_map,
                    name = 'oxidase type',
                    guide = guide_legend(reverse=TRUE),
                    aesthetics = c('color', 'fill')) +
  theme_minimal() +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        # legend.title = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(colour="darkgrey"),
        axis.ticks.x = element_line(colour="darkgrey"),
        panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        ) +
  stat_summary(geom = "crossbar",
               show.legend = F,
               width=0.65,
               fatten=0,
               color="white",
               fun.data = function(x) {
                 return(c(y=median(x),
                          ymin=median(x),
                          ymax=median(x)))
                 }) +
  theme(legend.position = c(0.20, 0.75))
save_plot('oxidase_boxplot_style1.png', p1)

p1.1 <- oxidases %>%
  ggplot(aes(x = Km, y = type, fill = type), color = 'gray') +
  geom_boxplot(outlier.shape = 21) +
  scale_x_continuous(trans = reverselog_trans(10),
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  scale_fill_manual(values = color_map,
                    name = 'oxidase type',
                    guide = guide_legend(reverse=TRUE),
                    aesthetics = c('fill')) +
  theme_bw() +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.y = element_blank())

p1.2 <- p1.1 + annotate('text', x = 0.025, y = 0.5, label = 'p-value < 2e-16')

# Box cox transform and natural log transform produce the same result
oxidases.lmer <- oxidases %>% 
  group_by(type) %>%
  mutate(Km_bc = car::bcPower(Km, 0),
         Km_log = log(Km)) %>%
  filter(!is.na(accession))
oxidases.lmer %>% select(Km_bc, Km_log)
## Adding missing grouping variables: `type`
## # A tibble: 333 x 3
## # Groups:   type [2]
##    type  Km_bc Km_log
##    <fct> <dbl>  <dbl>
##  1 other  4.83   4.83
##  2 other  4.69   4.69
##  3 other  4.01   4.01
##  4 other  5.19   5.19
##  5 other  2.46   2.46
##  6 other  2.30   2.30
##  7 other  7.00   7.00
##  8 other  3.22   3.22
##  9 other  4.25   4.25
## 10 other  3.33   3.33
## # … with 323 more rows
oxidases.lmer %>% filter(type == 'terminal')  %>%
  ggplot(aes(sample = Km_bc)) + stat_qq() + stat_qq_line()

lmer_res <- lmerTest::lmer(Km_log ~ type * (1 | accession ),
                           oxidases.lmer,
                           REML = F)
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
lmerTest::ranova(lmer_res) # Random effect makes fit better
## ANOVA-like table for random-effects: Single term deletions
## 
## Model:
## Km_log ~ type + (1 | accession)
##                 npar  logLik    AIC    LRT Df Pr(>Chisq)    
## <none>             4 -615.52 1239.0                         
## (1 | accession)    3 -666.54 1339.1 102.04  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lmer_res.summary <- lmer_res %>% summary()
lmer_res.summary # Terminal & Other are significantly different fits
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: Km_log ~ type * (1 | accession)
##    Data: oxidases.lmer
## 
##      AIC      BIC   logLik deviance df.resid 
##   1239.0   1254.3   -615.5   1231.0      329 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.45779 -0.50634  0.01752  0.48424  2.87124 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  accession (Intercept) 1.751    1.323   
##  Residual              1.513    1.230   
## Number of obs: 333, groups:  accession, 126
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)    4.3345     0.1697 116.9544   25.55   <2e-16 ***
## typeterminal  -4.9265     0.3340 128.7363  -14.75   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## typeterminl -0.508
p1.1 %>% save_plot('oxidase_boxplot_style2.png', .)
p1.2 %>% save_plot('oxidase_boxplot_style2_pval.png', .)
p1.2 %>% save_plot('oxidase_boxplot_style2_pval.svg', .)
p_phylum <- oxidases %>% ggplot(aes(y = Km, x = phylum, fill = phylum)) +
  geom_boxplot() +
  facet_wrap(~type, ncol=1, scales = 'free_y') +
  scale_y_continuous(trans = reverselog_trans(10),
                     breaks = logbreaks,
                     labels = loglabels) + 
  scale_fill_brewer(palette = 'Set1') + 
  coord_flip() + 
  cowplot::theme_cowplot()
cowplot::save_plot('boxplot_phylum.png', p_phylum, base_height=5)

message('Figure 1')
## Figure 1
p1.2

p2 <- oxidases %>%
  ggplot(aes(x = Km, group = type, fill = type)) +
  geom_density(alpha=.6) +
  scale_x_continuous(trans = reverselog_trans(10),
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  scale_fill_manual(values = color_map,
                    name = 'oxidase type',
                    guide = guide_legend(reverse=TRUE)) +
  theme_bw() +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major.y = element_blank(),
        legend.position = c(0.85,0.75),
        legend.box.background = element_rect(color = 'white'),
  )
p3 <- oxidases %>%
  ggplot(aes(x = Km, y = type, fill = type)) +
  ggridges::geom_density_ridges(alpha=.75, rel_min_height = 0.01) +
  scale_x_continuous(trans = reverselog_trans(10),
                     breaks = logbreaks,
                     labels = loglabels,
                     expand = c(0,0)) +
  scale_y_discrete(expand = c(0,0)) +
  coord_cartesian(clip = 'off') +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  scale_fill_manual(values = color_map,
                    name = 'oxidase type',
                    guide = guide_legend(reverse=TRUE)) +
  ggridges::theme_ridges() +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        legend.position = c(0.05,0.85),
        legend.box.background = element_rect(color = 'white'))
p2 %>% save_plot('oxidase_density_plot_cb.png', .)
p3 %>% save_plot('oxidase_ridge_plot.png', .)
## Picking joint bandwidth of 0.284
p2

p3
## Picking joint bandwidth of 0.284

(oxidases %>%
  ggplot(aes(y = genus, x = Km, color = type, label = KEGG, text = definition)) +
  geom_point() +
  scale_x_continuous(trans = reverselog_trans(10), 
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  scale_color_manual(values = color_map) +
  theme_bw()) %>%
  plotly::ggplotly()
oxidases %>%
  filter(!is.na(superkingdom)) %>%
  ggplot(aes(y = superkingdom, x = Km, fill = type)) +
  geom_boxplot() +
  scale_x_continuous(trans = reverselog_trans(10), 
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  scale_fill_manual(values = color_map) +
  theme_bw()

oxidases %>%
  filter(type == 'terminal') %>%
  pull(definition) %>% 
  table()
## .
## cytochrome aa3-600 menaquinol oxidase subunit I [EC:7.1.1.5] 
##                                                            3 
##       cytochrome bd ubiquinol oxidase subunit I [EC:7.1.1.7] 
##                                                           20 
##        cytochrome c oxidase cbb3-type subunit I [EC:7.1.1.9] 
##                                                           11 
##                  cytochrome c oxidase subunit 1 [EC:7.1.1.9] 
##                                                           11 
##                  cytochrome c oxidase subunit I [EC:7.1.1.9] 
##                                                            8 
##        cytochrome o ubiquinol oxidase subunit I [EC:7.1.1.3] 
##                                                           21
(oxidases %>%
  ggplot(aes(y = KEGG, x = Km, color = type, label = binomial, text = definition)) +
  geom_point() +
  scale_x_continuous(trans = reverselog_trans(10), 
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  scale_color_manual(values = color_map) +
  theme_bw()) %>%
  plotly::ggplotly()
oxidases %>%
  filter(type == 'terminal') %>%
  pull(definition) %>%
  table()
## .
## cytochrome aa3-600 menaquinol oxidase subunit I [EC:7.1.1.5] 
##                                                            3 
##       cytochrome bd ubiquinol oxidase subunit I [EC:7.1.1.7] 
##                                                           20 
##        cytochrome c oxidase cbb3-type subunit I [EC:7.1.1.9] 
##                                                           11 
##                  cytochrome c oxidase subunit 1 [EC:7.1.1.9] 
##                                                           11 
##                  cytochrome c oxidase subunit I [EC:7.1.1.9] 
##                                                            8 
##        cytochrome o ubiquinol oxidase subunit I [EC:7.1.1.3] 
##                                                           21
(oxidases %>%
   filter(type == 'terminal') %>%
  ggplot(aes(y = definition, x = Km, color = class, label = binomial, text = definition)) +
  geom_point() +
  scale_x_continuous(trans = reverselog_trans(10), 
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  scale_color_brewer(type = 'qual') +
  theme_bw()) %>%
  plotly::ggplotly()
headers <- c('KEGG', 'gene_name', 'annotation', 'E_Score', 'P_Score')
enzclass.sf <- read_tsv('enzclass.tsv', col_names = c('EC_sf', 'Definition_sf'), skip = 1)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   EC_sf = col_character(),
##   Definition_sf = col_character()
## )
enzclass.f <- read_tsv('enzclass.tsv', col_names = c('EC_f', 'Definition_f'), skip = 1)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   EC_f = col_character(),
##   Definition_f = col_character()
## )
oxidases <- oxidases %>%
  left_join(enzclass.sf, by = 'EC_sf') %>%
  left_join(enzclass.f, by = 'EC_f') 
oxidases %>%
  datatable(extensions = 'Buttons',
          options = list(dom = 'Blfrtip',
                         scrollX = TRUE,
                         buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
                         lengthMenu = list(c(10,25,50,-1),
                                           c(10,25,50,"All"))),
          rownames = FALSE)
EC_count <- oxidases %>%
  pull(EC_sf) %>%
  n_distinct()

(oxidases %>%
   filter(type == 'other') %>%
  ggplot(aes(y = KEGG, x = Km, color = EC_sf, label = binomial, text = definition)) +
  geom_point() +
  scale_x_continuous(trans = reverselog_trans(10), 
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  scale_color_manual(values = colorRampPalette(brewer.pal(8, 'Set1'))(EC_count), 
                     na.value = '#000000') +
  theme_bw()) %>%
  plotly::ggplotly()
EC_count <- oxidases %>%
  pull(EC_f) %>%
  n_distinct()

oxidases %>%
  select(EC_f, Definition_f) %>%
  group_by(EC_f) %>%
  count()
## # A tibble: 19 x 2
## # Groups:   EC_f [19]
##    EC_f          n
##    <chr>     <int>
##  1 1.1.3.-      20
##  2 1.1.99.-      2
##  3 1.10.3.-     11
##  4 1.13.11.-    94
##  5 1.13.12.-     8
##  6 1.14.11.-    22
##  7 1.14.12.-     9
##  8 1.14.13.-     5
##  9 1.14.16.-    21
## 10 1.14.17.-     5
## 11 1.14.99.-    10
## 12 1.17.1.-      2
## 13 1.18.1.-      2
## 14 1.3.3.-       2
## 15 1.4.3.-      15
## 16 1.5.3.-       5
## 17 1.7.3.-       3
## 18 7.1.1.-      74
## 19 <NA>         23
(oxidases %>%
   filter(type == 'other') %>%
  ggplot(aes(y = KEGG, x = Km, color = EC_f, label = binomial, text = definition)) +
  geom_point() +
  scale_x_continuous(trans = reverselog_trans(10), 
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  scale_color_manual(values = colorRampPalette(brewer.pal(8, 'Set1'))(EC_count), 
                     na.value = '#000000') +
  theme_bw()) %>%
  plotly::ggplotly()
source('./xgfs_colors.R')
oxidases %>% select(enzyme, definition, subtype)
## # A tibble: 333 x 3
##    enzyme                           definition                         subtype  
##    <chr>                            <chr>                              <chr>    
##  1 4,5-PCD (protocatechuate 4,5-di… protocatechuate 4,5-dioxygenase, … dioxygen…
##  2 Plant -dioxygenase (PADOX-1)     alpha-dioxygenase [EC:1.14.99.-]   dioxygen…
##  3 GenDOPa (gentisate 1,2-dioxygen… gentisate 1,2-dioxygenase [EC:1.1… dioxygen…
##  4 3,4-Dihydroxy-9,10-secoandrosta… 3,4-dihydroxy-9,10-secoandrosta-1… dioxygen…
##  5 catechol 1,2-dioxygenase         catechol 1,2-dioxygenase [EC:1.13… dioxygen…
##  6 catechol 1,2-dioxygenase         catechol 1,2-dioxygenase [EC:1.13… dioxygen…
##  7 OsARD1 (aci-reductone dioxygena… 1,2-dihydroxy-3-keto-5-methylthio… dioxygen…
##  8 2-monooxygenase                  phenol/toluene 2-monooxygenase (N… monooxyg…
##  9 2,3-dioxygenase                  benzene/toluene/chlorobenzene dio… dioxygen…
## 10 xylene monooxygenase             p-cymene methyl-monooxygenase ele… monooxyg…
## # … with 323 more rows
subtypes <- oxidases %>%
  pull(subtype) %>%
  as.factor() %>%
  levels()

oxidases.other <- oxidases %>%
  filter(subtype != 'terminal') %>%
  mutate(subtype = factor(subtype))
subtype_labels.df <- oxidases.other %>%
  count(subtype) %>%
  mutate(ylabels = paste0(subtype, ' (n=', n, ')'))
subtype_labels <- subtype_labels.df$ylabels
names(subtype_labels) <- subtype_labels.df$subtype
(oxidases.other %>%
  ggplot(aes(y = KEGG, x = Km, color = subtype, label = binomial, text = definition)) +
  geom_point() +
  scale_x_continuous(trans = reverselog_trans(10), 
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  # scale_color_manual(values = xgfs.6) +
  scale_color_manual(values = cb_pal.12) +
  theme_bw()) %>%
  plotly::ggplotly()
oxidases.other %>%
  ggplot(aes(y = subtype, x = Km, fill = subtype)) +
  scale_y_discrete(limits=rev(levels(oxidases.other$subtype[1:4])),
                   labels=subtype_labels[1:4]) +
  geom_boxplot(color='black') +
  scale_x_continuous(trans = reverselog_trans(10), 
                     breaks = logbreaks,
                     labels = loglabels) +
  geom_vline(xintercept = 25, linetype = 'dashed') +
  theme_bw() +
  scale_fill_manual(values = cb_pal.12)

# oxidases %>% pull(Definition_sf) %>% table()
oxidases.summary <- oxidases %>% 
  group_by(type) %>%
  summarise(count = n(),
            median = median(Km),
            min = min(Km),
            max = max(Km),
            IQR = IQR(Km),
  )
## `summarise()` ungrouping output (override with `.groups` argument)
oxidases.summary %>%
  write_tsv('oxidase_summary.tsv')
oxidases.summary
## # A tibble: 2 x 6
##   type     count median   min    max    IQR
##   <fct>    <int>  <dbl> <dbl>  <dbl>  <dbl>
## 1 other      259 60     0.3   3600   169.  
## 2 terminal    74  0.365 0.003   20.8   2.39